---------------------------------------------------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /Users/clotairemit.edu/Dropbox (MIT)/J-PAL Raskin Transition/10_Analysis&Results/Agent Experiment Analysis/03_
> Logs/20240426_podes_clean.log
  log type:  text
 opened on:  26 Apr 2024, 00:04:13

. 
. 
. *** Import part 1 data
. u "$importable/podes2018_desa_part1_revisi.dta", clear

. 
. ** merge in part 3 data
. merge 1:1 R101 R102 R103 R104 using "$importable/podes2018_desa_part3_revisi.dta", assert(3) nogen

    Result                      Number of obs
    -----------------------------------------
    Not matched                             0
    Matched                            83,931  
    -----------------------------------------

. 
. ** merge in part 4 data
. merge 1:1 R101 R102 R103 R104 using "$importable/podes2018_desa_part4_revisi.dta", assert(3) nogen

    Result                      Number of obs
    -----------------------------------------
    Not matched                             0
    Matched                            83,931  
    -----------------------------------------

. 
. qui foreach var of varlist R101 R102 R103 R104 {

. 
. * number of households (sum of PLN, non-PLN, and non-electricity HHs)
. ////summ R501A1 R501A2 R501B
> egen num_hh_desa = rowtotal(R501A1 R501A2 R501B)

. //summ num_hh_desa
. 
. egen total_hh = total(num_hh_desa)

. format total_hh %12.0fc

. //summ total_hh
. di `r(mean)'
943.49061

. drop total_hh

. // 77,412,096 seems like too many households; are these families?
. 
. // Question 401 does not appear to be in the data
. //tab  R403A
. //tab  R403B
. 
. 
. * Gov Bank
. 
.   * Number
. 
. gen himbara_bank = R1208AK2 > 0 & R1208AK2 < 200

. 
. * Distance
. 
. gen dist_himbara_bank = R1208AK3
(8,950 missing values generated)

. replace dist_himbara_bank = 0 if himbara_bank == 1
(8,950 real changes made)

. 
. 
.   * Ease of reaching gov bank if none in village
. 
. //tab  R1208AK4
. gen veasy_reach_gov = R1208AK4 == 1

. replace veasy_reach_gov = 1 if himbara_bank == 1
(8,950 real changes made)

. gen easy_reach_gov = R1208AK4 == 2

. replace easy_reach_gov = 1 if veasy_reach_gov == 1
(19,889 real changes made)

. gen diff_reach_gov = R1208AK4 == 3

. replace diff_reach_gov = 1 if easy_reach_gov == 1
(66,021 real changes made)

. gen vdiff_reach_gov = R1208AK4 == 4

. replace vdiff_reach_gov = 1 if diff_reach_gov == 1
(78,098 real changes made)

. gen miss_reach_gov = R1208AK4 == .

. 
. 
. * Private Bank
. 
.   * N
. 
. gen private_bank = R1208BK2 > 0 // No NAs fine to use ineq

. 
.   * Distance
. 
. gen dist_private_bank = R1208BK3
(2,802 missing values generated)

. replace dist_private_bank = 0 if private_bank == 1
(2,802 real changes made)

. 
. 
. 
.   * Ease of reaching private bank if none in village
. 
. //tab  R1208BK4
. gen veasy_reach_priv = R1208BK4 == 1

. replace veasy_reach_priv = 1 if private_bank == 1
(2,802 real changes made)

. gen easy_reach_priv = R1208BK4 == 2

. replace easy_reach_priv = 1 if veasy_reach_priv == 1
(11,143 real changes made)

. gen diff_reach_priv = R1208BK4 == 3

. replace diff_reach_priv = 1 if easy_reach_priv ==1
(53,285 real changes made)

. gen vdiff_reach_priv = R1208BK4 == 4

. replace vdiff_reach_priv = 1 if diff_reach_priv ==1
(71,287 real changes made)

. gen miss_reach_priv = R1208BK4 == .

. 
. * ATM
. 
.   * N
. 
. gen atm = R1209CK2 == 1

. 
. * Distance
. 
. gen distance_atm = R1209CK3
(12,412 missing values generated)

. replace distance_atm = 0 if atm == 1
(12,412 real changes made)

. 
. 
. 
. * ease of reaching bank atm  if none in village
. 
. gen veasy_reach_atm = R1209CK4 == 1

. replace veasy_reach_atm = 1 if atm == 1
(12,412 real changes made)

. gen easy_reach_atm = R1209CK4 == 2

. replace easy_reach_atm = 1 if veasy_reach_atm == 1
(21,938 real changes made)

. gen diff_reach_atm = R1209CK4 == 3

. replace diff_reach_atm = 1 if easy_reach_atm == 1
(65,460 real changes made)

. gen vdiff_reach_atm = R1209CK4 == 4

. replace vdiff_reach_atm = 1 if diff_reach_atm == 1
(77,485 real changes made)

. gen miss_reach_atm = R1209CK4 == .

. 
. 
. * Bank agent in village
. 
.    * N
. 
. //tab  R1209GK2
. gen agent = R1209GK2 == 1

. 
.   * Distance
. // maybe the distances are top coded at 99? Should I just leave as is?
. 
. gen distance_agent = R1209GK3
(23,666 missing values generated)

. replace distance_agent = 0 if agent == 1
(23,666 real changes made)

. 
. 
. 
. * ease of reaching bank agent  if none in village
. //tab  R1209GK4
. gen veasy_reach_agent = R1209GK4 == 1

. replace veasy_reach_agent = 1 if agent == 1
(23,666 real changes made)

. gen easy_reach_agent = R1209GK4 == 2

. replace easy_reach_agent = 1 if veasy_reach_agent == 1
(31,764 real changes made)

. gen diff_reach_agent = R1209GK4 == 3

. replace diff_reach_agent = 1 if easy_reach_agent == 1
(66,756 real changes made)

. gen vdiff_reach_agent = R1209GK4 == 4

. replace vdiff_reach_agent = 1 if diff_reach_agent == 1
(76,723 real changes made)

. gen miss_reach_agent = R1209GK4 == .

. 
. 
. * Type g activity, number, distance, accessibility (h in 2018)
. 
. rename R1206HK2 typeg_n

. rename R1206HK3 typeg_km

. replace typeg_km = 0 if typeg_n == 1
(6,035 real changes made)

. 
. gen typeg_km_missing = typeg_km ==. // trick to avoid losing observations

. replace typeg_km = 0 if typeg_km_missing == 1 //from controls later in analysis
(46,903 real changes made)

. 
. * Type f activity, number, distance, accessibility (G IN 2018)
. 
. rename R1206GK2 typef_n

. rename R1206GK3 typef_km

. replace typef_km = 0 if typef_n == 1
(3,348 real changes made)

. 
. gen typef_km_missing = typef_km ==. // trick to avoid losing observations

. replace typef_km = 0 if typef_km_missing == 1 //from controls later in analysis
(6,052 real changes made)

. 
. * Type e activity, number, distance, accessibility
. 
. rename R1206EK2 typee_n

. rename R1206EK3 typee_km

. replace typee_km = 0 if typee_n == 1
(6,407 real changes made)

. 
. gen typee_km_missing = typee_km ==. // trick to avoid losing observations

. replace typee_km = 0 if typee_km_missing == 1 //from controls later in analysis
(8,700 real changes made)

. 
. * Type j activity, number, distance, accessibility
. 
. rename R1206F1K2 typej_n

. rename R1206F1K3 typej_km

. replace typej_km = 0 if typej_n == 1
(4,824 real changes made)

. 
. gen typej_km_missing = typej_km ==. // trick to avoid losing observations

. replace typej_km = 0 if typej_km_missing == 1 //from controls later in analysis
(71,261 real changes made)

. 
. * Pool e and g: food beverage shops and minimarkets together
. egen typege_n = rowtotal(typeg_n typee_n)

. 
. * dummy variable for rice-producing agriculture
. gen rice_agri = R403A == 1 & R403B == 1

. //summ rice_agri
. 
. * dummy variable for non-rice-producing agriculture
. gen non_rice_agri = R403A == 1 & R403B != 1

. //summ non_rice_agri
. 
. * dummy variable for non-agricultural village
. gen non_agri = R403A != 1

. //summ non_agri
. 
. * dummy for missing
. gen miss_agri = R403A == .

. //summ *agri
. 
. ** dummy variables for transport types
. //tab  R403C1
. 
. * asphalt road
. gen asphalt_road_transport = R403C1 == 1

. 
. * gravel road
. gen gravel_road_transport = R403C1 == 2

. 
. * land transport
. gen land_transport = R403C1 == 3

. 
. * water transport
. gen water_transport = R403C1 == 4

. 
. * other transport
. gen other_transport = R403C1 == 5

. 
. * missing transport type
. gen miss_transport = R403C1 == .

. 
. //summ *transport
. 
. ** road passability
. //tab  R403C2
. 
. * road passable throughout the year
. gen road_passable_always = R403C2 == 1

. 
. * road passable usually, except at certain times
. gen road_passable_usually = R403C2 == 2

. 
. * road passable in dry season
. gen road_passable_dry_season = R403C2 == 3

. 
. * road impassable
. gen road_passable_never = R403C2 == 4

. 
. * road passable missing
. gen road_passable_missing = R403C2 == .

. //summ road_passable*
. 
. * distance to camat
. // NOTE: There are many implausible values in this variable. There does not appear to be a clear cutoff,
. //       so for now I will winsorize to 1st and 99th percentiles
. //summ R1002AK5
. _pctile R1002AK5, p(1 99)

. gen distance_camat = R1002AK5

. replace distance_camat = `r(r1)' if distance_camat < `r(r1)'
(0 real changes made)

. replace distance_camat = `r(r2)' if distance_camat > `r(r2)' & distance_camat != .
(827 real changes made)

. recode distance_camat (. = 0)
(0 changes made to distance_camat)

. //summ distance_camat
. 
. // missing indicator
. gen distance_camat_miss = R1002AK5 == .

. //tab  distance_camat_miss
. 
. 
. * Total time to camat
. // Calculate total time in minutes
. gen time_to_camat_raw = R1002AME + (R1002AJM * 60)

. //summ time_to_camat_raw
. // winsorize to 1st and 99th percentile
. _pctile time_to_camat_raw, p(1 99)

. gen time_to_camat = time_to_camat_raw

. replace time_to_camat = `r(r1)' if time_to_camat < `r(r1)'
(152 real changes made)

. replace time_to_camat = `r(r2)' if time_to_camat > `r(r2)' & time_to_camat != .
(728 real changes made)

. recode time_to_camat (. = 0)
(0 changes made to time_to_camat)

. 
. // missing indicator
. gen time_to_camat_miss = time_to_camat_raw == .

. //tab  time_to_camat_miss
. drop time_to_camat_raw

. 
. * Fees to travel to camat
. //summ R1002AK7
. gen fees_camat = R1002AK7

. _pctile fees_camat, p(1 99)

. replace fees_camat = `r(r1)' if fees_camat < `r(r1)'
(0 real changes made)

. replace fees_camat = `r(r2)' if fees_camat > `r(r2)' & fees_camat != .
(760 real changes made)

. recode fees_camat (. = 0)
(0 changes made to fees_camat)

. 
. // missing indicator
. gen fees_camat_miss = R1002AK7 == .

. //tab  fees_camat_miss
. 
. * cell phone quality in most urban and rural areas
. //tab  R1005C
. gen vstrong_cell_signal = R1005C == 1

. gen strong_cell_signal = R1005C == 2

. gen weak_cell_signal = R1005C == 3

. gen no_cell_signal = R1005C == 4

. gen miss_cell_signal = R1005C == .

. //summ *cell_signal
. 
. * internet signal quality in most areas in the village
. //tab  R1005D
. gen internet_cell_4G = R1005D == 1

. gen internet_cell_3G = R1005D == 2

. gen internet_cell_25G = R1005D == 3

. gen internet_cell_none = R1005D == 4

. gen internet_cell_miss = R1005D == .

. //summ internet*
. 
. * hetero vars for internet/cell signal
. gen hi_phone_sig = R1005C == 1 | R1005C == 2

. gen hi_int_sig = R1005D == 1 | R1005D == 2

. //summ hi*
. 
. * land under rice cultivation is missing!
. 
. 
. ** heterogeneity: travel time to capital (minutes)
. gen time_camat = R1002AME + (R1002AJM * 60)

. gen time_bupati = R1002BME + (R1002BJM * 60)

. gen time_other_camat = R1002CME + (R1002CJM * 60)

. gen time_other_bupati = R1002DME + (R1002DJM * 60)

. //summ time_camat time_other_camat time_bupati time_other_bupati
. count if time_other_camat < time_camat
  10,856

. count if time_bupati > time_other_bupati
  16,176

. 
. * min of travel times
. gen min_time_camat = min(time_camat, time_other_camat)

. gen min_time_bupati = min(time_bupati, time_other_bupati)

. 
. * winsorize to the 0.5th and 99.5th percentiles
. gen min_time_camat_wins = min_time_camat

. _pctile min_time_camat, p(0.5 99.5)

. replace min_time_camat_wins = `r(r1)' if min_time_camat < `r(r1)'
(189 real changes made)

. replace min_time_camat_wins = `r(r2)' if min_time_camat > `r(r2)' & min_time_camat != .
(388 real changes made)

. //summ min_time_camat_wins min_time_camat
. 
. gen min_time_bupati_wins = min_time_bupati

. _pctile min_time_bupati, p(0.5 99.5)

. replace min_time_bupati_wins = `r(r1)' if min_time_bupati < `r(r1)'
(272 real changes made)

. replace min_time_bupati_wins = `r(r2)' if min_time_bupati > `r(r2)' & min_time_bupati != .
(417 real changes made)

. //summ min_time_bupati_wins min_time_bupati
. 
. drop time_camat time_other_camat time_bupati time_other_bupati min_time_camat min_time_bupati

. rename *_wins *

. //summ min*
. 
. // village head education level
. //tab  R1701AK5
. gen desa_head_edu18 = R1701AK5
(3,433 missing values generated)

. 
. // we will add education of head HH at baseline so we can apply the 0 replacement and dummy for NAs
.   gen desa_head_edu18_missing = R1701AK5 == .

.   replace desa_head_edu18 = 0 if R1701AK5 == .
(3,433 real changes made)

. 
. 
. 
. 
. 
. ** heterogeneity: road conditions
. //tab  R1001A R1001B1
. gen nonasphalt = R1001B1 != 1

. //tab  nonasphalt
. 
. //tab  R1001B2
. gen roadimpass = R1001B2 != 1

. //tab  roadimpass
. 
.     // No branches of bank types A, B, or C (all counts are 0) and no bank agent is present in the village
. gen no_bankaccess = ((R1208AK2 == 0) & (R1208BK2 == 0) & (R1208CK2 == 0) & (R1209GK2 == 2))

. //tab  no_bankaccess
. 
. 
. 
. ** save
.   keep  R101 R102 R103 R104 *gov* *agent* *atm* *himbara* *priv* typej* typeg* typef* typee* *agri *transport road_passab
> le* roadimpass nonasphalt distance_camat* time_to_camat* fees_camat* *cell* min* num_hh_desa no_bankaccess *desa_head* hi
> * desa_head_edu18 desa_head_edu18_missing

.   foreach var of varlist *agri *transport road_passable* distance_camat* time_to_camat* fees_camat* *cell* distance_agent
>  agent atm distance_atm *himbara* *private* desa_head_edu18 desa_head_edu18_missing no_bankaccess {
  2.     rename `var' `var'_podes
  3.   }

. 
.   di c(k)
86

.   foreach var of varlist *podes {
  2.     assert `var' != .
  3.   }

.   foreach var of varlist  typeg* typef* typee* typej* {
  2.     rename `var' `var'_podes18
  3.   }

. 
.   save "$cleaned/podes_2018.dta", replace
file /Users/clotairemit.edu/Dropbox (MIT)/J-PAL Raskin Transition/10_Analysis&Results/Agent Experiment
    Analysis/01_Data/cleaned/podes_2018.dta saved

.   //summ roadimpass nonasphalt
. 
. 
. 
. /*----------------------------------------------------*/
.                   /* Section: PODES 2019 */
. /*----------------------------------------------------*/
. 
. u "$importable/podes_2019_b1b13.dta", clear

. 
. ** Small businesses recovering from set of various_activities
. 
. * Type g activity, number, distance, accessibility (h in 2018)
. 
. rename R801GK2 typeg_n

. rename R801GK3 typeg_km

. gen typeg_veasy = R801GK4 == 1

. replace typeg_veasy =. if R801GK4==.
(0 real changes made)

. gen typeg_easy = R801GK4 == 2

. replace typeg_easy =. if R801GK4==.
(0 real changes made)

. gen typeg_hard = R801GK4 == 3

. replace typeg_hard =. if R801GK4==.
(0 real changes made)

. gen typeg_vhard = R801GK4 == 4

. replace typeg_vhard =. if R801GK4==.
(0 real changes made)

. 
. * Type f activity, number, distance, accessibility (G IN 2018)
. 
. rename R801FK2 typef_n

. rename R801FK3 typef_km

. gen typef_veasy = R801FK4 == 1

. replace typef_veasy =. if R801FK4==.
(0 real changes made)

. gen typef_easy = R801FK4==2

. replace typef_easy =. if R801FK4==.
(0 real changes made)

. gen typef_hard = R801FK4 == 3

. replace typef_hard =. if R801FK4==.
(0 real changes made)

. gen typef_vhard = R801FK4 == 4

. replace typef_vhard =. if R801FK4==.
(0 real changes made)

. 
. 
. * Type e activity, number, distance, accessibility
. 
. rename R801EK2 typee_n

. rename R801EK3 typee_km

. gen typee_veasy = R801EK4 == 1

. replace typee_veasy =. if R801EK4==.
(0 real changes made)

. gen typee_easy = R801EK4 == 2

. replace typee_easy =. if R801EK4==.
(0 real changes made)

. gen typee_hard = R801EK4 == 3

. replace typee_hard =. if R801EK4==.
(0 real changes made)

. gen typee_vhard = R801EK4 == 4

. replace typee_vhard =. if R801EK4==.
(0 real changes made)

. 
. 
. * Type j activity, number, distance, accessibility
. 
. rename R801JK2 typej_n

. rename R801JK3 typej_km

. gen typej_veasy = R801JK4 == 1

. replace typej_veasy =. if R801JK4==.
(0 real changes made)

. gen typej_easy = R801JK4 == 2

. replace typej_easy =. if R801JK4==.
(0 real changes made)

. gen typej_hard = R801JK4 == 3

. replace typej_hard =. if R801JK4==.
(0 real changes made)

. gen typej_vhard = R801JK4 == 4

. replace typej_vhard =. if R801JK4==.
(0 real changes made)

. 
. * Pooling activities of types g and e
. egen typege_n = rowtotal(typeg_n typee_n)

. 
. gen no_bankaccess = ((R802A == 0) & (R802B == 0) & (R802C == 0) )

. gen bankaccess = 1 - no_bankaccess

. 
. foreach var of varlist bankaccess {
  2.   rename `var' `var'_podes19
  3. }

. 
. *** new village head
. ** merge in 2018 (baseline) versions of variables
.   qui foreach var of varlist R10? {

.   merge 1:1 R101 R102 R103 R104 using "$cleaned/podes_2018.dta", keepusing(*_podes18)

    Result                      Number of obs
    -----------------------------------------
    Not matched                           498
        from master                       252  (_merge==1)
        from using                        246  (_merge==2)

    Matched                            83,685  (_merge==3)
    -----------------------------------------

.   // NOTE: going forward, will need to create a crosswalk between PODES 2018 and 2019
.   drop if _m == 2
(246 observations deleted)

.   rename _m merge18

. 
. ** save
.   keep R101 R102 R103 R104  typeg* typef* typee* typej* bankaccess_podes19

.   //summ
. 
. 
.   save "$cleaned/podes_2019.dta", replace
file /Users/clotairemit.edu/Dropbox (MIT)/J-PAL Raskin Transition/10_Analysis&Results/Agent Experiment
    Analysis/01_Data/cleaned/podes_2019.dta saved

. 
. cap log close
